{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Hyperparameter Optimization Tutorial\n",
    "\n",
    "This tutorial will show you the main hyper-parameters for LearningWithNoisyLabels. There are only two!\n",
    "\n",
    "1. `prune_method` : str (default: `'prune_by_noise_rate'`), Method used for pruning.\n",
    "    * Values: [`'prune_by_class'`, `'prune_by_noise_rate'`, or `'both'`]. \n",
    "    * `'prune_by_noise_rate'`: works by removing examples with *high probability* of being mislabeled for every non-diagonal in the prune_counts_matrix (see pruning.py).\n",
    "    * `'prune_by_class'`: works by removing the examples with *smallest probability* of belonging to their given class label for every class.\n",
    "    * `'both'`: Finds the examples satisfying (1) AND (2) and removes their set conjunction. \n",
    "\n",
    "\n",
    "2. converge_latent_estimates : bool (Default: False)\n",
    "    * If true, forces numerical consistency of latent estimates. Each is estimated independently, but they are related mathematically with closed form  equivalences. This will iteratively enforce mathematically consistency.\n",
    "\n",
    "## This tutorial uses hypopt for faster hyper-optimization using a validation set (instead of slow cross validation).\n",
    "### `$ pip install hypopt`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Python 2 and 3 compatibility\n",
    "from __future__ import print_function, absolute_import, division, unicode_literals, with_statement"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from hypopt.model_selection import GridSearch\n",
    "from cleanlab.classification import LearningWithNoisyLabels\n",
    "from cleanlab.noise_generation import generate_noise_matrix_from_trace\n",
    "from cleanlab.noise_generation import generate_noisy_labels\n",
    "from cleanlab.util import print_noise_matrix\n",
    "from sklearn.datasets import make_classification\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "import numpy as np\n",
    "import copy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_linear_dataset(n_classes = 3, n_samples = 300):\n",
    "    X, y = make_classification(n_samples = n_samples, n_features=2, n_redundant=0, n_informative=2,\n",
    "                           random_state=1, n_clusters_per_class=1, n_classes=n_classes)\n",
    "    rng = np.random.RandomState(2)\n",
    "    X += 2 * rng.uniform(size=X.shape)\n",
    "    return (X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "param_grid = {\n",
    "    'prune_method': ['prune_by_class', 'prune_by_noise_rate', 'both'],\n",
    "    'converge_latent_estimates': [True, False],\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set the sparsity of the noise matrix.\n",
    "frac_zero_noise_rates = 0.0 # Consider increasing to 0.5\n",
    "# A proxy for the fraction of labels that are correct.\n",
    "avg_trace = 0.65 # ~35% wrong labels. Increasing makes the problem easier.\n",
    "# Amount of data for each dataset.\n",
    "dataset_size = 250 # Try 250 or 400 to use less or more data.\n",
    "num_classes = 3\n",
    "\n",
    "ds = make_linear_dataset(n_classes=num_classes, n_samples=num_classes*dataset_size)\n",
    "X, y = ds\n",
    "X = StandardScaler().fit_transform(X)\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=0)\n",
    "X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.25, random_state=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      " =========== \n",
      " Naive Bayes \n",
      " ===========\n",
      "\n",
      " Noise Matrix (aka Noisy Channel) P(s|y) of shape (3, 3)\n",
      " p(s|y)\ty=0\ty=1\ty=2\n",
      "\t---\t---\t---\n",
      "s=0 |\t0.52\t0.1\t0.34\n",
      "s=1 |\t0.2\t0.82\t0.05\n",
      "s=2 |\t0.28\t0.07\t0.61\n",
      "\tTrace(matrix) = 1.95\n",
      "\n",
      "Comparing 6 parameter setting(s) using 4 CPU thread(s) ( 1 job(s) per thread ).\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 6/6 [00:00<00:00,  9.12it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy with default parameters: 0.7\n",
      "Accuracy with optimized parameters: 0.73\n",
      "\n",
      "Optimal parameter settings using Naive Bayes\n",
      "--------------------------------------------\n",
      "converge_latent_estimates : False\n",
      "prune_method : both\n",
      "\n",
      " =================== \n",
      " Logistic Regression \n",
      " ===================\n",
      "\n",
      " Noise Matrix (aka Noisy Channel) P(s|y) of shape (3, 3)\n",
      " p(s|y)\ty=0\ty=1\ty=2\n",
      "\t---\t---\t---\n",
      "s=0 |\t0.52\t0.1\t0.34\n",
      "s=1 |\t0.2\t0.82\t0.05\n",
      "s=2 |\t0.28\t0.07\t0.61\n",
      "\tTrace(matrix) = 1.95\n",
      "\n",
      "Comparing 6 parameter setting(s) using 4 CPU thread(s) ( 1 job(s) per thread ).\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 6/6 [00:00<00:00,  8.47it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy with default parameters: 0.69\n",
      "Accuracy with optimized parameters: 0.71\n",
      "\n",
      "Optimal parameter settings using Logistic Regression\n",
      "----------------------------------------------------\n",
      "converge_latent_estimates : False\n",
      "prune_method : both\n"
     ]
    }
   ],
   "source": [
    "for name, clf in [\n",
    "    (\n",
    "        \"Naive Bayes\", \n",
    "        GaussianNB(),\n",
    "    ),\n",
    "    (\n",
    "        \"Logistic Regression\", \n",
    "         LogisticRegression(random_state=0, solver = 'lbfgs', multi_class = 'auto'),\n",
    "    ),\n",
    "]:\n",
    "    print(\"\\n\", \"=\"*len(name), \"\\n\", name, '\\n', \"=\"*len(name))\n",
    "    np.random.seed(seed=0)\n",
    "    clf_copy = copy.deepcopy(clf)\n",
    "    # Compute p(y=k), the ground truth class prior on the labels.\n",
    "    py = np.bincount(y_train) / float(len(y_train))\n",
    "    # Generate the noisy channel to characterize the label errors.\n",
    "    noise_matrix = generate_noise_matrix_from_trace(\n",
    "        K = num_classes,\n",
    "        trace = num_classes * avg_trace, \n",
    "        py = py,\n",
    "        frac_zero_noise_rates = frac_zero_noise_rates,\n",
    "    )\n",
    "    print_noise_matrix(noise_matrix)\n",
    "    # Create the noisy labels. This method is exact w.r.t. the noise_matrix.\n",
    "    y_train_with_errors = generate_noisy_labels(y_train, noise_matrix)\n",
    "    lnl_cv = GridSearch(\n",
    "        model=LearningWithNoisyLabels(clf),\n",
    "        param_grid=param_grid,\n",
    "        num_threads=4,\n",
    "        seed=0,\n",
    "        parallelize=True,\n",
    "    )\n",
    "    lnl_cv.fit(\n",
    "        X_train = X_train, \n",
    "        y_train = y_train_with_errors,\n",
    "        X_val = X_val,\n",
    "        y_val = y_val,\n",
    "        verbose = True,\n",
    "    )\n",
    "    # Also compute the test score with default parameters\n",
    "    clf_copy.fit(X_train, y_train_with_errors)\n",
    "    score_opt = lnl_cv.model.score(X_test, y_test)\n",
    "    score_default = clf_copy.score(X_test, y_test)\n",
    "    print(\"Accuracy with default parameters:\", np.round(score_default, 2))\n",
    "    print(\"Accuracy with optimized parameters:\", np.round(score_opt, 2))\n",
    "    print()\n",
    "    s = \"Optimal parameter settings using {}\".format(name)\n",
    "    print(s)\n",
    "    print(\"-\"*len(s))\n",
    "    for key in lnl_cv.best_params.keys():\n",
    "        print(key, \":\", lnl_cv.best_params[key])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
